Este documento tem como objetivo entender os dados relacionados à COVID-19.
# Load the WHO COVID-19 data, normalise the column names, sort by report date
# and country, and parse the report date column into a Date.
global_data <- read.csv2(
  "dataset/WHO-COVID-19-global-data.csv",
  header = TRUE,
  sep = ","
) %>%
  clean_names() %>%
  arrange(date_reported, country) %>%
  mutate(date_reported = as.Date(date_reported, format = "%Y-%m-%d"))

# First and last report dates present in the WHO data.
start_date <- min(global_data[["date_reported"]])
last_date <- max(global_data[["date_reported"]])

# Load the population table and rename three countries so that their names
# match the spelling used in the WHO data (the join key below).
population <- read.csv2(
  "dataset/population_by_country_2020.csv",
  header = TRUE,
  sep = ","
) %>%
  clean_names() %>%
  rename(country = country_or_dependency) %>%
  mutate(
    country = as.character(country),
    country = case_when(
      country == "United States" ~ "United States of America",
      country == "United Kingdom" ~ "The United Kingdom",
      country == "Iran" ~ "Iran (Islamic Republic of)",
      TRUE ~ country
    )
  )

# Join the COVID data with the population data on the country name.
global_data <- merge(x = global_data, y = population, by = "country")

# Derive the rate columns, then give every country one row per day between
# start_date and last_date.
global_data <- global_data %>%
  mutate(
    # When new_cases is negative, add its absolute value back to the
    # cumulative count; the rates below use this adjusted value.
    cumulative_cases = if_else(
      new_cases < 0,
      -new_cases + cumulative_cases,
      cumulative_cases
    ),
    # Percentages, formatted to four decimal places through sprintf().
    rate_deaths_population = as.numeric(
      sprintf("%0.4f", cumulative_deaths * 100 / population_2020)
    ),
    rate_cases_population = as.numeric(
      sprintf("%0.4f", cumulative_cases * 100 / population_2020)
    ),
    rate_deaths_cases = as.numeric(
      sprintf("%0.4f", cumulative_deaths * 100 / cumulative_cases)
    )
  ) %>%
  group_by(country) %>%
  complete(
    date_reported = seq.Date(from = start_date, to = last_date, by = "day")
  ) %>%
  ungroup()
#' Plot the time series of the top-ranked countries for one metric
#'
#' Ranks the rows of `data` reported on `dt` by the column named in `ycol`
#' (descending), keeps the first `h` countries, and draws one line per
#' selected country over every date present in `data`.
#'
#' @param data Data frame containing at least the columns `country`,
#'   `date_reported` and the column named by `ycol`.
#' @param h Number of rows kept after ranking (passed to `head()`).
#' @param dt Value compared against `date_reported` to pick the ranking day.
#' @param ycol Name, as a single string, of the column used both for the
#'   ranking and as the y axis.
#' @param title Plot title.
#' @param xlab X-axis label.
#' @param ylab Y-axis label.
#' @return The object returned by `plotly::ggplotly()`.
top_covid <- function(data = NULL, # data frame
                      h = 10, # head
                      dt, # last date
                      ycol, # columns used in arrange
                      title, # title to plot
                      xlab, # xlab to plot
                      ylab # ylab to plot
) {
  # Fail early with a readable message instead of an error raised from deep
  # inside dplyr/rlang (the default `data = NULL` can never be plotted).
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (!is.character(ycol) || length(ycol) != 1L || is.na(ycol)) {
    stop("`ycol` must be a single column name given as a string.",
         call. = FALSE)
  }
  missing_cols <- setdiff(c("country", "date_reported", ycol), names(data))
  if (length(missing_cols) > 0L) {
    stop("`data` is missing required column(s): ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }

  # Countries holding the `h` largest values of `ycol` on the date `dt`.
  top_countries <- data %>%
    filter(date_reported == dt) %>%
    arrange(desc(!!sym(ycol))) %>%
    head(h) %>%
    pull(country)

  # Full time series restricted to those countries, one line per country.
  # The `xlab`/`ylab` arguments share their names with the ggplot2 functions;
  # R resolves a call like `xlab(...)` to the function, so both coexist.
  p <- data %>%
    filter(country %in% top_countries) %>%
    ggplot(aes(x = date_reported, y = !!sym(ycol), colour = country)) +
    geom_line() +
    ggtitle(title) +
    xlab(xlab) +
    ylab(ylab)

  plotly::ggplotly(p)
}
# Top 10 countries by cumulative deaths on the last reported date.
top_covid(
  data = global_data,
  h = 10,
  dt = last_date,
  ycol = "cumulative_deaths",
  title = "Top 10 Countrys - Deaths Covid 19",
  xlab = "Date",
  ylab = "Cumulative Deaths"
)

# Top 10 countries by cumulative cases on the last reported date.
top_covid(
  data = global_data,
  h = 10,
  dt = last_date,
  ycol = "cumulative_cases",
  title = "Top 10 Countrys - Cases Covid 19",
  xlab = "Date",
  ylab = "Cumulative Cases"
)

# Top 10 countries by deaths as a percentage of the total population.
top_covid(
  data = global_data,
  h = 10,
  dt = last_date,
  ycol = "rate_deaths_population",
  title = "Top 10 Countrys - Death rate by population - Covid 19",
  xlab = "Date",
  ylab = "Death Rate"
)

# Top 10 countries by cases as a percentage of the total population.
top_covid(
  data = global_data,
  h = 10,
  dt = last_date,
  ycol = "rate_cases_population",
  title = "Top 10 Countrys - Infected rate by population - Covid 19",
  xlab = "Date",
  ylab = "Infected Rate"
)

# Top 10 countries by deaths as a percentage of cumulative cases.
top_covid(
  data = global_data,
  h = 10,
  dt = last_date,
  ycol = "rate_deaths_cases",
  title = "Top 10 Countrys - Deaths by Infected rate - Covid 19",
  xlab = "Date",
  ylab = "Death by Infected Rate"
)
# Keep only the numeric columns and drop every row containing a missing value.
only_numeric_data <- global_data %>%
  select_if(is.numeric) %>%
  na.omit()
## new_cases cumulative_cases new_deaths cumulative_deaths
## new_cases 1.00 0.78 0.78 0.67
## cumulative_cases 0.78 1.00 0.61 0.94
## new_deaths 0.78 0.61 1.00 0.60
## cumulative_deaths 0.67 0.94 0.60 1.00
## population_2020 0.20 0.18 0.14 0.14
## net_change 0.18 0.13 0.11 0.08
## density_p_km -0.02 -0.02 -0.02 -0.02
## land_area_km 0.41 0.37 0.32 0.31
## rate_deaths_population 0.14 0.26 0.19 0.40
## rate_cases_population 0.14 0.19 0.10 0.20
## rate_deaths_cases 0.06 0.10 0.13 0.20
## population_2020 net_change density_p_km land_area_km
## new_cases 0.20 0.18 -0.02 0.41
## cumulative_cases 0.18 0.13 -0.02 0.37
## new_deaths 0.14 0.11 -0.02 0.32
## cumulative_deaths 0.14 0.08 -0.02 0.31
## population_2020 1.00 0.86 -0.02 0.55
## net_change 0.86 1.00 -0.03 0.39
## density_p_km -0.02 -0.03 1.00 -0.07
## land_area_km 0.55 0.39 -0.07 1.00
## rate_deaths_population -0.03 -0.06 -0.01 -0.01
## rate_cases_population -0.06 -0.09 0.09 -0.05
## rate_deaths_cases 0.02 0.01 -0.06 0.02
## rate_deaths_population rate_cases_population
## new_cases 0.14 0.14
## cumulative_cases 0.26 0.19
## new_deaths 0.19 0.10
## cumulative_deaths 0.40 0.20
## population_2020 -0.03 -0.06
## net_change -0.06 -0.09
## density_p_km -0.01 0.09
## land_area_km -0.01 -0.05
## rate_deaths_population 1.00 0.63
## rate_cases_population 0.63 1.00
## rate_deaths_cases 0.32 0.06
## rate_deaths_cases
## new_cases 0.06
## cumulative_cases 0.10
## new_deaths 0.13
## cumulative_deaths 0.20
## population_2020 0.02
## net_change 0.01
## density_p_km -0.06
## land_area_km 0.02
## rate_deaths_population 0.32
## rate_cases_population 0.06
## rate_deaths_cases 1.00
# Scatter plot of daily new cases (y) against daily new deaths (x) for Brazil,
# with points coloured by new_cases and a loess smoother on top.
brazil_data <- filter(global_data, country == "Brazil")

graf_cases_deaths <- ggplot(
  data = brazil_data,
  mapping = aes(x = new_deaths, y = new_cases)
) +
  geom_point(mapping = aes(colour = new_cases)) +
  geom_smooth(method = "loess") +
  labs(
    subtitle = "Novos Casos vs Novas Mortes - Brasil",
    x = "Novas mortes",
    y = "Novos casos",
    color = "Casos vs Mortes"
  )

# Render the ggplot object as an interactive plotly widget.
plotly::ggplotly(graf_cases_deaths)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 46 rows containing non-finite values (stat_smooth).
# Linear regression of cumulative cases on cumulative deaths.
lm(
  formula = cumulative_cases ~ cumulative_deaths,
  data = global_data
)
##
## Call:
## lm(formula = cumulative_cases ~ cumulative_deaths, data = global_data)
##
## Coefficients:
## (Intercept) cumulative_deaths
## 1102.00 14.43